; 01 Try DI primary vec. bx=4 bp=8, si stays 100h (or secondary vec).

org 100h        ; assume al=0 bx=0 sp=di=-2 si=0100h bp=09??h; last 16 bytes of PSP = 0
   ; dw -16     ; F0FF40C2: lock inc word [bx+si-0x3e]
   ; The two bytes below occupy 100h..101h. They execute as a harmless
   ; `test dl,al` and, at the same time, are the upper two bytes of the dword
   ; at 0FEh (= RO+8), i.e. they are data for the RO vector defined below.
   db 0x84,0xC2 ;=test dl,al
; PN and RO are three-dword float vectors that are never emitted by the
; assembler: they are overlaid on memory just below/around 100h. `$` is 102h
; here, so PN = 0EEh (dwords at 0EEh, 0F2h, 0F6h) and RO = 0F6h (dwords at
; 0F6h, 0FAh, 0FEh). PN's third dword and RO's first dword are the same
; memory ("fallthrough"). Everything below 100h relies on the header's
; assumption that the last 16 bytes of the PSP are zero.
PN equ $-4*5    ; dd 0.0, -1.0, /*0.0*/  ; fallthrough
RO equ $-4*3    ; dd 0.0, 0.0, -66.0 (bytes 00 00 84 C2 = 0C2840000h; an earlier note said -66.1)
; Entry code. With si=100h (assumed above) the effective address is PN+6, the
; high word of PN's second dword. Storing 0BF80h over a zero low word yields
; 0BF800000h = -1.0f, which gives PN the value listed in its comment.
I:mov word[byte PN+4+2 + si-100h],0xBF80

;Video mode + palette: 4 bits orange * 4 bits blue. Uses default index 0 (black).
; One loop serves two purposes. The first pass reaches P with al=13h, so
; int 10h sets video mode 13h (this assumes ah=0 at entry). Every later pass
; reaches P with ax=1010h, the BIOS "set one DAC register" call, with the
; colour index in bx and the components in dh/ch/cl.
; The startup registers are saved by pusha and restored by popa, so the loop
; may clobber ax/bx/cx/dx freely.
  pusha
  mov al,13h
P:int 10h       ; set video mode | set palette index: bx=i dh=R ch=G cl=B
  inc bx        ; next colour index (index 0 is never written: the loop exits before its int 10h)
  mov al,bl
  aam 16        ; ax = ....rrrr....bbbb
; BIG is the address of the AAM immediate byte (10h). The word at BIG is that
; byte followed by the first opcode byte of the next instruction; the pixel
; loop loads this word as the integer constant Z (27408 = 6B10h per its
; comment, i.e. it relies on the imul below being encoded with opcode 6Bh).
BIG equ $-1
  imul dx,ax,4  ; dh = 4*high nibble (R), dl = 4*low nibble (B): 0..60, no carry between bytes
  mov ax,1010h  ; BIOS function for the next pass through P
  mov cx,dx     ; cl = B; ch is turned into G below
  add ch,cl
  shr ch,1      ; ch=G = (R+B)/2
  ; ZF is set only when R and B are both 0, which happens when bl has wrapped to 0.
  jnz P         ; dx=cx=0 bx=100h ax=1010h
  popa          ; back to the register values saved before the loop

  dec di        ; di = pixel address = -3

; Main loop. M starts a frame; X/X2 is the per-pixel loop (65536 iterations per
; frame). Register roles inside the loop:
;   dx:bx  32-bit screen-coordinate accumulator; dx is read as Y, the middle
;          16 bits (dl:bh) are read as X
;   cx     frame counter T
;   es:di  destination of the next pixel byte
;   si,bp  si=100h and bp=09??h as assumed at program start; bp+si is the
;          scratch 3-float vector V used by the helper routines
; For each pixel the code builds a ray direction (X, Y, Z), normalizes it,
; intersects it with the plane (PN, PD) as seen from RO, and colours the pixel
; from the hit point: a checker pattern on a hit, 0F0h when t<0.
M:mov dx,0xA000-10-20-20-4;  ; visible pixels are A0000..AF9FF: want X=0 Y=0 in the center of the screen
  mov es,dx     ; dx:bx=YX:XX = es:0     must be neighbors after PUSHA

X:   ;cx=T di=adr_pixel(init=0) bp=09?? si=0100 ah=0   ; cf=0
  inc dx        ; carry from the low-word add at the bottom of the loop (also runs once per frame after M)
X2:
  fninit        ; adr:     -18 -16 -14 -12 -10  -8  -6  -4  -2
  pusha         ; stack:    di  si  bp  sp  bx  dx  cx  ax   0
  xor bx,bx     ; s16:  pixadr 100 9??  -2  ..X..Y  T (result)
  ; bx=0 turns [bx-N] into absolute offsets into the frame pusha just wrote
  ; (sp=-2 before the push, as in the table above).
  fild word[byte BIG + si-100h] ; Z=27408
  fild word[bx-8]  ; Y
  ; Unaligned word at -9: high byte of the pushed bx plus low byte of the
  ; pushed dx, which is why those two registers must be adjacent in the frame.
  fild word[bx-9]  ; X   |rD.x rD.y rD.z

  mov di,104h       ; di = si+4, so [bp+di] / [bx+di] address the y component in the helpers
  pusha             ; second frame: bx and bp are repointed for the helper calls below
  mov bx,bp         ; both DOT operands = V, so the result is V.V
  call STORE_DOT    ;V=rD (unnormalized);  |rD*rD
  fsqrt
  fld1
  fdivrp st1        ;|rsqrt(rD*rD)
  call LOAD_SCALE   ;|rd.x rd.y rd.z

; PD is the address of the imm16 inside the next instruction. That immediate is
; PN-0x100 = -18, and it is read back further down as the integer plane
; constant pd.
PD equ $+1 ;=-18
  mov bx,PN-0x100   ; bx+si = PN
  call STORE_DOT    ;V=rd   |D=pn*rd
  mov bp,RO-0x100   ; bp+si = RO
  call DOT          ; pushes pn*ro on top of D
  popa              ; restore the second frame: bx=0, di=104h, bp = V base again

  fisubr word[byte PD + si-100h]   ;|N=pd-pn*ro D
  fdivrp            ;|t
  ftst              ;set cf if t<0 (no hit)
  fnstsw ax
  sahf              ; FPU condition bits -> CPU flags; CF is consumed by the jb below
  ; Nothing between here and `jb S` writes the CPU flags (FPU instructions,
  ; call/ret and popa leave them alone), so CF survives.
  call LOAD_SCALE   ;|t*rd.x t*rd.y t*rd.z

                    ; h = {ro + rd*t}; we need only x and z (+ no need to add ro)
  fstp st1          ; drop the y term: |t*rd.x t*rd.z
  fistp word[bp+si] ;V = s16(h.x)
  fiadd word[bx-6]  ; + pushed cx (T)
  fistp word[bx-4] ; pushed_ax = s16(h.z + T)
  popa              ; restore the first frame; ax now holds the value just stored
  jb  S             ; t<0: use the fixed colour 0F0h
  xor al,[bp+si]    ; checker: low byte of (h.z+T) xor low byte of h.x ...
  and al,8          ; ... keep bit 3 only, so al = 0 or 8
  ; 0A9h is the opcode of `test ax,imm16`: it swallows the two bytes of the
  ; following `mov al,0xF0` as its immediate, so al keeps the checker value.
  db 0xA9 ; skip 2 bytes (=test ax,NN)
S:mov al,0xF0
  stosb             ; write the pixel to es:di and advance di (assumes DF=0)
  add bx,0xCCCD ;dx:bx = YXX += 0000CCCD
  jnc X2            ; no carry out of bx: dx is unchanged
  jnz X   ;do 65536 iterations
  ; Reached when the add produced carry and bx==0. The step is odd, so bx
  ; returns to its starting value only after 65536 additions.

  inc cx  ; T++
  in al,60h         ; read port 60h (keyboard data port)
  dec al
  jnz M             ; next frame unless the byte read was 1 (the Esc make code)
  ; On exit, execution falls into LOAD_SCALE below and leaves through its `ret`.
  ; Per the stack table above the word at the top of the stack is 0, so that
  ; ret jumps to offset 0 (presumably the PSP's int 20h terminate; DOS .COM convention).
 ;ret     ; fallthrough

;LOAD: fld1 | call
; LOAD_SCALE: replace the scalar on top of the FPU stack with the 3-float
; vector at bp+si multiplied by that scalar.
;   In:  st0 = k; bp+si -> x (dword), bp+di -> y, bp+si+8 -> z
;        (callers keep di = si+4, so the three dwords are contiguous)
;   Out: st0 = k*x, st1 = k*y, st2 = k*z (net FPU stack depth +2)
; Only FPU instructions are used, so CPU flags set by the caller are still
; valid after the call. The main loop also falls through into this routine
; when it exits and uses the `ret` below to leave the program.
LOAD_SCALE: ; v3* bp+si, |k --> k*x k*y k*z
  fld dword[bp+di]
  fmul st1           ;|ky k
  fld dword[bp+si+8]
  fmul st2           ;|kz ky k
  fxch st2           ;|k ky kz
  fmul dword[bp+si]  ;|kx ky kz
  ret

;STORE: call | fstp st0
; STORE_DOT: pop three floats from the FPU stack into the vector at bp+si,
; then fall through into DOT to compute that vector's dot product with the
; vector at bx+si.
;   In:  st0 = x, st1 = y, st2 = z; bp+si -> destination; bx+si -> second operand
;        (di = si+4 is expected, so [bp+di] is the y slot)
;   Out: x, y, z popped and stored as dwords; DOT's result is left in st0
; A store-only use is call + `fstp st0` to discard the dot product, as the
; hint on the first line says.
STORE_DOT: ; v3* bp+si, |x y z -->
  fstp dword[bp+si]    ; x
  fstp dword[bp+di]    ; y
  fstp dword[bp+si+8]  ; z

; DOT: push the dot product of two 3-float vectors onto the FPU stack.
;   In:  bp+si -> vector a, bx+si -> vector b; the y components are addressed
;        through di ([bp+di], [bx+di]), so di = si+4 is expected
;   Out: st0 = a.x*b.x + a.y*b.y + a.z*b.z (net FPU stack depth +1)
; Also reached by fall-through from STORE_DOT. Uses FPU instructions only.
DOT:  ; v3* bp+si, v3* bx+si, | --> ax*bx+ay*by+az*bz
  fld dword[bp+si]
  fmul dword[bx+si]    ;|ax*bx
  fld dword[bp+di]
  fmul dword[bx+di]    ;|ay*by ax*bx
  faddp                ;|ax*bx+ay*by
  fld dword[bp+si+8]
  fmul dword[bx+si+8]  ;|az*bz ax*bx+ay*by
  faddp                ;|a.b
  ret
